Imports

# Load the cleaned per-annotator annotations and prepare them for plotting.
# - `X` is the name read.csv() gives an unnamed first column; it is renamed to
#   `pandas_index` (presumably the row index written by pandas -- confirm).
# - annotator and label codes are turned into factors with readable labels.
# - `annotation_length` is the span width, `end - start`.
# - annotations with a span longer than 100 are dropped.
all_annotations <-
  read.csv(
    here::here("data", "4b-handle-long-annotations", "cleaned_annotations.csv")
  ) %>%
  rename(pandas_index = X) %>%
  mutate(
    # identifiers are categorical, not numeric
    annotation_id = factor(annotation_id),
    line_id = factor(line_id),

    # technical codes -> display names (same order in `levels` and `labels`)
    annotator = factor(
      annotator,
      levels = c("cborgelt",  "danielE",   "danielG",   "katharina", "sascha"),
      labels = c("Christian", "Daniel E.", "Daniel G.", "Katharina", "Sascha")
    ),
    label = factor(
      label,
      levels = c("EVENT", "LOC",      "MISC",          "ORG",          "PER",    "TIME"),
      labels = c("Event", "Location", "Miscellaneous", "Organisation", "Person", "Time")
    ),

    annotation_length = end - start
  ) %>%
  filter(annotation_length <= 100)

# interactive table of the loaded data, without the two id columns
all_annotations %>%
  select(-c(pandas_index, annotation_id)) %>%
  datatable()
# Load the union dataset and prepare it like the per-annotator data:
# ids become factors, label codes get readable names, and the span width
# `end - start` is stored as `annotation_length`. Unlike `all_annotations`,
# no length filter is applied here.
union_annotations <-
  read.csv(
    here::here("data", "5a-generate-union-dataset", "union_dataset.csv")
  ) %>%
  # `X` is the name read.csv() assigns to an unnamed first column
  rename(pandas_index = X) %>%
  mutate(
    annotation_id = factor(annotation_id),
    line_id = factor(line_id),
    label = factor(
      label,
      levels = c("EVENT", "LOC",      "MISC",          "ORG",          "PER",    "TIME"),
      labels = c("Event", "Location", "Miscellaneous", "Organisation", "Person", "Time")
    ),
    annotation_length = end - start
  )

# print the data without the two id columns
union_annotations %>%
  select(-c(pandas_index, annotation_id))
# Fill colours keyed by the technical label code
# (presumably the colours configured in doccano -- confirm).
doccano_colour_scheme_technical <- c(
  "ATTENTION"         = "#f44e3b",
  "EVENT"             = "#0062b1",
  "LOC"               = "#7b64ff",
  "MISC"              = "#a79c4f",
  "ORG"               = "#16a5a5",
  "PER"               = "#fcdc00",
  "Personal Bookmark" = "#fda1ff",
  "POSTCORR"          = "#9f0500",
  "TIME"              = "#73d8ff",
  "?"                 = "#cccccc"
)

# The same colours keyed by the display names used in the label factors.
# Derived from the technical scheme so each hex code is written only once.
doccano_colour_scheme_named <- setNames(
  doccano_colour_scheme_technical[
    c("EVENT", "LOC", "MISC", "ORG", "PER", "TIME")
  ],
  c("Event", "Location", "Miscellaneous", "Organisation", "Person", "Time")
)
# Fill colours keyed by the annotator's technical user name.
annotator_colour_scheme_technical <- c(
  "danielE"   = "#ff7f00",
  "katharina" = "#e4211c",
  "cborgelt"  = "#377eb8",
  "danielG"   = "#984ea3",
  "sascha"    = "#4daf4a"
)

# The same colours keyed by the display names used in the `annotator` factor.
annotator_colour_scheme_named <- c(
  "Daniel E." = "#ff7f00",
  "Katharina" = "#e4211c",
  "Christian" = "#377eb8",
  "Daniel G." = "#984ea3",
  "Sascha"    = "#4daf4a"
)

Global Label Length

# create pie chart: https://r-graph-gallery.com/piechart-ggplot2.html
# relative frequencies: https://homepages.gac.edu/~anienow2/MCS_142/R/ggplot2-barchart.html --> set aes(x = "")
# manual category colours: https://statisticsglobe.com/change-colors-of-bars-in-ggplot2-barchart-in-r
# change legend's title: https://www.geeksforgeeks.org/how-to-change-legend-title-in-ggplot2-in-r/

# Per-label summary of the union dataset: number of annotations plus total,
# median, mean and standard deviation of the annotation length.
global_label_length <-
  summarise(
    group_by(union_annotations, label),
    count = n(),
    total_length = sum(annotation_length),
    median_length = median(annotation_length),
    mean_length = mean(annotation_length),
    sd_length = sd(annotation_length)
  ) %>%
  mutate(
    # order the label levels by ascending total annotation length so that
    # plots show the labels in that order:
    # https://sarahleejane.github.io/learning/r/2014/09/17/ordering-factors-by-another-column-with-R.html
    # factor() first drops levels that do not occur in the summary
    label = reorder(factor(label), total_length)
  )

global_label_length
# Share of each label in four summary measures of the union dataset.
# Every measure becomes one horizontal bar normalised to 100 %
# (position = "fill"); the segments show each label's share of that measure.
# `sd_length` is intentionally not part of the plot.
global_label_length %>%
  # long format: one row per (label, measure), the number goes into `value`
  pivot_longer(
    cols = c(count, total_length, mean_length, median_length),
    names_to = "measure"
  ) %>%
  mutate(
    # readable measure names, in the order the bars should appear
    measure = factor(
      measure,
      levels = c("count", "total_length",            "mean_length",               "median_length"),
      labels = c("Count", "Total Annotation Length", "Average Annotation Length", "Median Annotation Length")
    )
  ) %>%

  # sort nicely for debugging
  arrange(measure, label) %>%
  select(measure, label, value) %>%

  # plot data; geom_col() is the idiomatic form of geom_bar(stat = "identity")
  ggplot(aes(x = measure, y = value, fill = label)) +
  geom_col(position = "fill", colour = "black") +

  scale_fill_manual(values = doccano_colour_scheme_named) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "Relative Frequency", fill = "Label") +
  coord_flip() +
  theme_classic()

Here’s the same thing as a pie chart.

# Pie chart of the total annotation length per label: a single stacked bar
# (constant x) drawn in polar coordinates.
ggplot(
  global_label_length,
  aes(x = "", y = total_length, fill = label)
) +
  geom_col(width = 1, colour = "black") +
  # map the stacked y values onto the angle, which turns the bar into a pie
  coord_polar("y", start = 0) +

  # make it pretty
  scale_fill_manual(values = doccano_colour_scheme_named) +
  labs(fill = "Label") +
  theme_void()

# To add an "overall" facet that pools all labels, set `margins = TRUE`:
#     facet_grid(~label, margins = TRUE)
# Source: https://stackoverflow.com/questions/54523133/adding-overall-group-to-facet-wrap-ggplot2#comment95851202_54523970

# Histogram of annotation lengths (bins of width 10), one facet per label.
all_annotations %>%
  ggplot(aes(x = annotation_length, fill = label)) +
  geom_histogram(binwidth = 10, colour = "black") +
  facet_grid(~label) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  # the facet strips already name the label, so hide the fill legend;
  # "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4
  guides(fill = "none") +
  theme_bw()

# Box plot of the annotation length for each label, filled with the label's
# colour from the named doccano scheme.
ggplot(
  all_annotations,
  aes(x = label, y = annotation_length, fill = label)
) +
  geom_boxplot() +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  labs(x = "Label", y = "Annotation Length", fill = "Label") +
  theme_bw()

# Per-annotator summary: number of annotations plus total, median, mean and
# standard deviation of the annotation length.
annotator_contributions <- summarise(
  group_by(all_annotations, annotator),
  count = n(),
  total_length = sum(annotation_length),
  median_length = median(annotation_length),
  mean_length = mean(annotation_length),
  sd_length = sd(annotation_length)
)

annotator_contributions
# Share of each annotator in four summary measures of their annotations.
# Every measure becomes one horizontal bar normalised to 100 %
# (position = "fill"); the segments show each annotator's share.
# `sd_length` is intentionally not part of the plot.
annotator_contributions %>%
  # long format: one row per (annotator, measure), the number goes into `value`
  pivot_longer(
    cols = c(count, total_length, mean_length, median_length),
    names_to = "measure"
  ) %>%
  mutate(
    # readable measure names, in the order the bars should appear
    measure = factor(
      measure,
      levels = c("count", "total_length",            "mean_length",               "median_length"),
      labels = c("Count", "Total Annotation Length", "Average Annotation Length", "Median Annotation Length")
    )
  ) %>%

  # sort nicely for debugging
  arrange(measure, annotator) %>%
  select(measure, annotator, value) %>%

  # plot data; geom_col() is the idiomatic form of geom_bar(stat = "identity")
  ggplot(aes(x = measure, y = value, fill = annotator)) +
  geom_col(position = "fill", colour = "black") +

  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "Relative Frequency", fill = "Annotator") +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  coord_flip() +
  theme_classic()

# Histogram of annotation lengths (bins of width 10), one facet per annotator.
all_annotations %>%
  ggplot(aes(x = annotation_length, fill = annotator)) +
  geom_histogram(
    binwidth = 10,
    colour = "black"
  ) +
  facet_grid(~annotator) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  # the facet strips already name the annotator, so hide the fill legend;
  # "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4
  guides(fill = "none") +
  theme_bw()

# Box plot of the annotation length for each annotator, filled with the
# annotator's colour from the named annotator scheme.
ggplot(
  all_annotations,
  aes(x = annotator, y = annotation_length, fill = annotator)
) +
  geom_boxplot() +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  labs(x = "Annotator", y = "Annotation Length", fill = "Annotator") +
  theme_bw()

Labels per Annotator

# Total annotation length and number of annotations for every
# (annotator, label) combination that occurs in `all_annotations`.
label_annotator <- all_annotations %>%
  group_by(annotator, label) %>%
  summarise(
    total_length = sum(annotation_length),
    count = n(),
    # return an ungrouped table so the mutate() below sees all rows at once
    .groups = "drop"
  ) %>%
  mutate(
    # Use the same label order as `global_label_length` (ascending total
    # length in the union dataset). The previous reorder() call paired rows
    # with `global_label_length$total_length` by position inside each
    # annotator group, which fails as soon as an annotator has not used every
    # label. Labels unknown to `global_label_length` are appended at the end
    # instead of turning into NA; levels that do not occur are dropped.
    label = droplevels(factor(
      label,
      levels = union(levels(global_label_length$label), levels(label))
    ))
  )
# stacked bar chart: https://r-graph-gallery.com/48-grouped-barplot-with-ggplot2 --> position = "fill"

# plot it as stacked bar chart
# Stacked bar chart: one bar per annotator, normalised to 100 %, split by the
# total annotation length each label contributes.
ggplot(
  label_annotator,
  aes(x = annotator, y = total_length, fill = label)
) +
  geom_col(position = "fill", colour = "black") +

  # make it pretty
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  labs(x = "Annotator", y = "Relative Label Frequency", fill = "Label") +
  theme_classic()

Annotations per Label

# plot it as stacked bar chart
# Stacked bar chart: one bar per label, normalised to 100 %, split by the
# total annotation length each annotator contributes.
ggplot(
  label_annotator,
  aes(x = label, y = total_length, fill = annotator)
) +
  geom_col(position = "fill", colour = "black") +

  # make it pretty
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  labs(x = "Label", y = "Relative Label Frequency", fill = "Annotator") +
  theme_classic()

Combinations of Annotator and Label

# Histogram of annotation lengths (bins of width 10) for every combination of
# annotator (rows) and label (columns).
# No fill aesthetic is mapped, so there is no fill legend to hide; the former
# `guides(fill = FALSE)` had no effect and only raised a deprecation warning.
all_annotations %>%
  ggplot(aes(x = annotation_length)) +
  geom_histogram(binwidth = 10) +
  facet_grid(annotator~label) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  theme_bw()

# Box plot of the annotation length for every combination of annotator (rows)
# and label (columns). The constant x = "" gives one box per facet.
ggplot(all_annotations, aes(x = "", y = annotation_length)) +
  geom_boxplot() +
  facet_grid(annotator~label) +
  labs(x = "", y = "Annotation Length") +
  theme_bw()